************************************************************************************************
* Purpose: prepare the raw NOVAX contact data from the Danish municipalities:
* define the types of contacts in the NOVAX data;
* afterwards, define who are mothers, fathers and children in the NOVAX contact data, which is our main data.
* Result: PNRs of mother, father and child.
* Finally, we end up with PNRs from municipality*year cells with good coverage of the NOVAX records.
************************************************************************************************

do "D:\Data\workdata\708296\Project - Mother groups and mental health\Do\global.do" 

* All contacts and their types
* All contacts in the data
* Use the raw contact data from NOVAX, i.e. one spell is one specific nurse contact/registration of any sort for either kid, mom or dad

* Stage the raw NOVAX contact extracts in $work and tag every record with the
* extract it was read from (string variable source).
* a, b and c are file-name suffixes, not variables, so the loop uses the plain
* "in" list form; "of newlist" validates each token as a new variable name and
* therefore depends on whatever dataset is in memory when the do-file starts.
foreach part in a b c {
	use "$raw\besog_kontakt`part'", clear
	gen source = "`part'"
	save "$work\besog_kontakt`part'", replace
}

* Fourth extract, read from kont_holbaek_aeroe_fanoe; all of its records are
* tagged "holbaek".
use "$raw\kont_holbaek_aeroe_fanoe", clear
gen source = "holbaek"
save "$work\besog_kontakt_holbaek", replace


* Stack the four staged extracts into one contact file and name the person
* identifier pnr.
use "$work\besog_kontakta", clear
append using "$work\besog_kontaktb"
append using "$work\besog_kontaktc"
append using "$work\besog_kontakt_holbaek"
rename cpr pnr

* Date of the record: parse the day-month-year string Dato into a daily date.
gen date = date(Dato, "DMY")
format date %td

* Report, without stopping, records whose non-empty Dato could not be parsed;
* these records carry a missing date from here on.
quietly count if missing(date) & Dato != ""
if r(N) > 0 {
	display as text "Note: " r(N) " records have a Dato that could not be parsed (date is missing)"
}
drop Dato

order pnr date
* stable: records with the same pnr and date keep their appended order instead
* of being ordered arbitrarily, so the file saved below is reproducible.
sort pnr date, stable
compress

* Missing pnrs
* The test looks at the character 12 positions from the end of pnr and drops
* the record when that character is empty (presumably: pnr is empty or shorter
* than 12 characters - confirm against the pnr format of the extracts).
drop if substr(pnr, -12, 1) == ""

*rename KommuneID muni
compress
************************************************************************************************
save "$work\all_contacts.dta", replace // 5.5 mio records...
************************************************************************************************
* classify types of contacts
*
* Step 1: remove records whose visit type (Besøgstype) contains any of the
* substrings below. Matching is case-sensitive, which is why several patterns
* appear in two spellings. Every condition reads the unchanged Besøgstype, so
* the order of the conditions does not matter.

* Group 1: consent, secretary, notes, journal entries and other entries that
* are excluded from the contact classification.
drop if strpos(Besøgstype, "samtykke") > 0 ///
	| strpos(Besøgstype, "Samtykke") > 0 ///
	| strpos(Besøgstype, "Sekretær") > 0 ///
	| strpos(Besøgstype, "BabyLab") > 0 ///
	| strpos(Besøgstype, "DSI") > 0 ///
	| strpos(Besøgstype, "Notat fra anden") > 0 ///
	| strpos(Besøgstype, "Resum") > 0 ///
	| strpos(Besøgstype, "Tilføjelse") > 0 ///
	| strpos(Besøgstype, "Administrativ") > 0 ///
	| strpos(Besøgstype, " Admministativ") > 0 ///
	| strpos(Besøgstype, "Aktindsigt") > 0 ///
	| strpos(Besøgstype, "Journal") > 0 ///
	| strpos(Besøgstype, "journal") > 0 ///
	| strpos(Besøgstype, "KMD") > 0 ///
	| strpos(Besøgstype, "Kollegial") > 0 ///
	| strpos(Besøgstype, "orientering") > 0 ///
	| strpos(Besøgstype, "Tillæg") > 0 ///
	| strpos(Besøgstype, "sendt") > 0 ///
	| strpos(Besøgstype, "Indgående") > 0 ///
	| strpos(Besøgstype, "Afrikaturen") > 0 ///
	| strpos(Besøgstype, "Utilsigtet") > 0

* Group 2: cancellations, no-shows, de-registrations, refusals and any type
* containing "ikke"/"Ikke" (not).
drop if strpos(Besøgstype, "afbud") > 0 ///
	| strpos(Besøgstype, "Afbud") > 0 ///
	| strpos(Besøgstype, "udeblive") > 0 ///
	| strpos(Besøgstype, "Udeblivel") > 0 ///
	| strpos(Besøgstype, "Udeblevet") > 0 ///
	| strpos(Besøgstype, "udmeldt") > 0 ///
	| strpos(Besøgstype, "Udmeldt") > 0 ///
	| strpos(Besøgstype, "afmeldt") > 0 ///
	| strpos(Besøgstype, "Ikke") > 0 ///
	| strpos(Besøgstype, "ikke") > 0 ///
	| strpos(Besøgstype, "Aflyst") > 0 ///
	| strpos(Besøgstype, "aflyst") > 0 ///
	| strpos(Besøgstype, "Aflysning") > 0 ///
	| strpos(Besøgstype, "aflyser") > 0 ///
	| strpos(Besøgstype, "Afviser") > 0 ///
	| strpos(Besøgstype, "fravalgt") > 0 ///
	| strpos(Besøgstype, "Fravalgt") > 0 ///
	| strpos(Besøgstype, "Forgæves") > 0

* Group 3: visit types labelled with a child age of 1½ to 3½ years.
drop if strpos(Besøgstype, "3½ år") > 0 ///
	| strpos(Besøgstype, "1 1/2") > 0 ///
	| strpos(Besøgstype, "1½") > 0 ///
	| strpos(Besøgstype, "2½ års") > 0 ///
	| strpos(Besøgstype, "2 1/2 års") > 0 ///
	| strpos(Besøgstype, "2½-3") > 0 ///
	| strpos(Besøgstype, "3-års") > 0 ///
	| strpos(Besøgstype, "3 års") > 0 ///
	| strpos(Besøgstype, "3-3") > 0 ///
	| strpos(Besøgstype, "3½-") > 0 ///
	| strpos(Besøgstype, "1½ års") > 0 ///
	| strpos(Besøgstype, "2 ½") > 0

* Step 2: remove records from the school module. The variable school is kept
* (missing for every remaining record) because it is referenced again further
* down in this file.
gen school = 1 if Modul == "Skole"
drop if school == 1

* Contact-type indicators, part 1: visit, behov and grav.
* Each indicator is 1 when the rule matches and missing otherwise.
* The classification variable is written with its full name, Klassifikation,
* everywhere, so the rules do not rely on variable-name abbreviation.

* visit: the visit type contains "besøg" in one of three spellings (matched
* across all modules), or the classification is one of the visit classes.
gen visit = .
replace visit = 1 if strpos(Besøgstype, "esøg") > 0 ///
	| strpos(Besøgstype, "ESØG") > 0 ///
	| strpos(Besøgstype, "esog") > 0
replace visit = 1 if inlist(Klassifikation, "Besøg", "Andre besøg", "Barselsbesøg", "Behov", "Etablering")
* A visit type containing "ikke" never counts as a visit.
replace visit = . if strpos(Besøgstype, "ikke") > 0

* behov: visits classified as "Behov" or "Andre besøg".
gen behov = 1 if visit == 1 & inlist(Klassifikation, "Behov", "Andre besøg")

* grav: pregnancy contacts, identified from the visit type or the classification.
gen grav = .
replace grav = 1 if strpos(Besøgstype, "graviditet") > 0 | strpos(Besøgstype, "Graviditet") > 0
replace grav = 1 if inlist(Klassifikation, "Graviditetsbesøg", "Teamsamtale Gravid", "Teamsamtale gravid", "Teamsamtale gravide")
* Visit types "Barselsbesøg" and "Etableringsbesøg" are not counted as pregnancy
* contacts even if a rule above matched.
replace grav = . if strpos(Besøgstype, "Barselsbesøg") > 0 | strpos(Besøgstype, "Etableringsbesøg") > 0

* Contact-type indicators, part 2: consultation, phone, lettermail and group.
* Each indicator is 1 when the visit type contains one of the listed
* substrings (case-sensitive) and missing otherwise.

gen consultation = 1 if strpos(Besøgstype, "onsultation") > 0
gen phone = 1 if strpos(Besøgstype, "elefon") > 0

* lettermail: mail, letter (brev), correspondence or SMS.
gen lettermail = 1 if strpos(Besøgstype, "mail") > 0 ///
	| strpos(Besøgstype, "Mail") > 0 ///
	| strpos(Besøgstype, "brev") > 0 ///
	| strpos(Besøgstype, "Brev") > 0 ///
	| strpos(Besøgstype, "orrespondance") > 0 ///
	| strpos(Besøgstype, "SMS") > 0

* group: group-type activities.
gen group = 1 if strpos(Besøgstype, "gruppe") > 0 ///
	| strpos(Besøgstype, "FIV") > 0 ///
	| strpos(Besøgstype, "Forældreklasse") > 0 ///
	| strpos(Besøgstype, "Gruppe") > 0 ///
	| strpos(Besøgstype, "mødegang") > 0 ///
	| strpos(Besøgstype, "Hold") > 0 ///
	| strpos(Besøgstype, "iværksætter") > 0 ///
	| strpos(Besøgstype, "Mødegang") > 0

* A group entry that records a cancellation, no-show or de-registration, or
* that contains "Ikke", does not count as a group contact.
replace group = . if strpos(Besøgstype, "afbud") > 0 ///
	| strpos(Besøgstype, "udeblivelse") > 0 ///
	| strpos(Besøgstype, "Udeblivelse") > 0 ///
	| strpos(Besøgstype, "udmeldt") > 0 ///
	| strpos(Besøgstype, "Ikke") > 0 ///
	| strpos(Besøgstype, "Udmeldt") > 0

* Contact-type indicators, part 3: henvist, underret and openhouse.

* henvist: the visit type mentions a referral ("envisning", "envist", "Henv")
* or contains "Fysio" or "Ergo". Unlike the other indicators this one is
* created as 0/1 rather than 1/missing.
gen henvist = strpos(Besøgstype, "envisning") > 0 ///
	| strpos(Besøgstype, "envist") > 0 ///
	| strpos(Besøgstype, "Henv") > 0 ///
	| strpos(Besøgstype, "Fysio") > 0 ///
	| strpos(Besøgstype, "Ergo") > 0

* underret: the visit type contains "Underret"; 1 or missing.
gen underret = 1 if strpos(Besøgstype, "Underret") > 0

* openhouse: the visit type contains one of the "Åben..." / "åbent hus"
* spellings; 1 or missing.
gen openhouse = 1 if strpos(Besøgstype, "Åbent hus") > 0 ///
	| strpos(Besøgstype, "Åben") > 0 ///
	| strpos(Besøgstype, "Åbent Hus") > 0 ///
	| strpos(Besøgstype, "åbent hus") > 0 ///
	| strpos(Besøgstype, "Åbent") > 0

* other = 1 for records that matched none of the contact-type indicators
* (visit, consultation, phone, lettermail, henvist, openhouse, group,
* underret, grav), and 0 otherwise.
* All variables are referenced by their full names so that this step does not
* depend on variable-name abbreviation being enabled or unambiguous.
gen other = !(visit == 1 | consultation == 1 | phone == 1 | lettermail == 1 ///
	| henvist == 1 | openhouse == 1 | group == 1 | underret == 1 | grav == 1)

* Turn the 1/missing indicators into 0/1 indicators.
foreach v of varlist school visit behov consultation phone lettermail group henvist underret openhouse grav other {
	replace `v' = 0 if `v' == .
}
compress
*******************************************************************************
* all contacts - types
save "$work\contacts_types.dta", replace
*******************************************************************************


*******************************************************************************
* use contact data to extract all relevant pnrs: all pnrs in NOVAX
*******************************************************************************

* Keep the first spell per pnr (earliest date) to find unique children/moms/dads.
bysort pnr (date): keep if _n == 1

* Year of that first contact. If moms/dads have multiple kids, this is the
* first contact recorded for the parent.
gen year_firstcontact = year(date)

save "$work\unique_firstcontacts.dta", replace

*******************************************************************************************************************
* List of pnrs only, used as the key file for the merges that follow.
keep pnr
save "$work\novaxpnr_formerge.dta", replace

* these are all pnrs that appear in NOVAX - in the contact data we do not know whether a pnr is a mom, dad or kid, so we have to do some merges to
* figure that out
* the novaxpnr_formerge file has a list of all pnrs used in NOVAX so we can figure out who these people are
*******************************************************************************************************************


********************************************************************************
* now we use mfr to figure out whether pnr in NOVAX identifies a mom, dad or kid
* We create datasets for each type of match: child, mother or father
* so here pnr is either kid mom or dad from NOVAX data
* for moms and dads the data is wide: the pnr is the mom/dad and all kids are defined by pnrb's
********************************************************************************

* Kids: start from the GRUND/mfr sample of kids and keep those whose pnr
* appears in NOVAX.
use "$work\mfr_kids", clear
merge 1:1 pnr using "$work\novaxpnr_formerge.dta", keep(match) nogenerate
keep pnr mfr_yob mfr_dob id_*
gen kid = 1
save "$work\mergedkids", replace

* Moms and dads: start from the grund/mfr parent files (wide format: pnr is
* the parent and the kids are held in cprb1,2,3...) and keep the parents whose
* pnr appears in NOVAX. The indicator variable is named mom or dad.
foreach parent in mom dad {
	use "$work\the`parent's_wide", clear
	merge 1:1 pnr using "$work\novaxpnr_formerge.dta", keep(match) nogenerate
	gen `parent' = 1
	save "$work\merged`parent's", replace
}

* now we append the three datasets of kids, moms and dads - so we have a list of pnrs in NOVAX but now we know that these pnrs are moms dads kids who have contacts 

* Build one list of NOVAX pnrs with indicators for dad, mom and kid.
use "$work\mergeddads", clear
append using "$work\mergedmoms"
append using "$work\mergedkids"
keep pnr* kid mom dad  mfr_*  // pnr is either mom dad or kid in this dataset...

* Position in the appended file (dads, then moms, then kids). It serves as the
* final sort key below so that the row kept per pnr never depends on an
* arbitrary ordering of ties.
gen long append_order = _n

* Remove pnrs that are registered both as a mom and as a dad.
bysort pnr: egen mother = max(mom)
bysort pnr: egen father = max(dad)
drop if mother == 1 & father == 1
drop mother father
label var pnr "PNR of mom, dad or kid"

* Make pnr unique: keep the row with the lowest mfr_yob per pnr; ties are
* broken by the appended order.
* NOTE(review): a pnr that appears both as a kid and as a parent keeps the
* indicator(s) of the surviving row only - confirm that this is intended.
sort pnr mfr_yob append_order
by pnr: keep if _n == 1
drop append_order

order pnr mom dad kid
save "$work\sample_novax.dta", replace // 392,302

// this is the sample of unique pnrs (with indicators for mom, dad and kid) that we can use to merge other NOVAX subsets (e.g. breastfeeding data), assess coverage and so on
